Load Libraries

#install.packages("pacman")

pacman::p_load(tidyverse, BiocManager, devtools, dada2, 
               phyloseq, patchwork, DT, iNEXT, vegan,
               install = FALSE)

Goals of this file

  1. Use raw fastq files and generate quality plots to assess quality of reads.
  2. Filter and trim out bad sequences and bases from our sequencing files.
  3. Write out fastq files with high quality sequences.
  4. Evaluate the quality from our filter and trim.
  5. Infer errors on forward and reverse reads individually.
  6. Identified ASVs on forward and reverse reads seperately using the error model.
  7. Merge forward and reverse ASVs into “contiguous ASVs”.
  8. Generate the ASV count table (otu_table input for phyloseq).
  9. Remove chimeras.
  10. Track the read counts
  11. Assign taxonomy.
  12. Prepare the data for export!

Output that we need:

  1. ASV Count Table: otu_table
  2. Taxonomy Table: tax_table
  3. Sample Information: sample_data - track the reads lost throughout the DADA2 workflow

Load Data

# Set the raw fastq path to the raw sequencing files 
setwd("/local/workdir/sna49/moon_milk")
# Path to the fastq files 
raw_fastqs_path <- "/local/workdir/sna49/moon_milk/fastq_files"
raw_fastqs_path
## [1] "/local/workdir/sna49/moon_milk/fastq_files"
# What files are in this path? Intuition Check 
head(list.files(raw_fastqs_path))
## [1] "ERR11588428_1.fastq.gz" "ERR11588428_2.fastq.gz" "ERR11588429_1.fastq.gz"
## [4] "ERR11588429_2.fastq.gz" "ERR11588430_1.fastq.gz" "ERR11588430_2.fastq.gz"
# How many files are there? 
str(list.files(raw_fastqs_path))
##  chr [1:20] "ERR11588428_1.fastq.gz" "ERR11588428_2.fastq.gz" ...
# Create vector of forward reads
forward_reads <- list.files(raw_fastqs_path, pattern = "_1.fastq.gz", full.names = TRUE)  
# Intuition Check 
head(forward_reads)  
## [1] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588428_1.fastq.gz"
## [2] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588429_1.fastq.gz"
## [3] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588430_1.fastq.gz"
## [4] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588431_1.fastq.gz"
## [5] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588432_1.fastq.gz"
## [6] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588433_1.fastq.gz"
# Create a vector of reverse reads 
reverse_reads <- list.files(raw_fastqs_path, pattern = "_2.fastq.gz", full.names = TRUE)
head(reverse_reads)
## [1] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588428_2.fastq.gz"
## [2] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588429_2.fastq.gz"
## [3] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588430_2.fastq.gz"
## [4] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588431_2.fastq.gz"
## [5] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588432_2.fastq.gz"
## [6] "/local/workdir/sna49/moon_milk/fastq_files/ERR11588433_2.fastq.gz"

Assess Raw Read Quality

Evaluate raw sequence quality

Let’s see the quality of the raw reads before we trim

Plot 12 random samples of plots

# Randomly select 12 samples from dataset to evaluate 
# we only have 20 reads total so we will be doing 2 random samples 
random_samples <- sample(1:length(reverse_reads), size = 2)
random_samples
## [1] 3 1
# Calculate and plot quality of these two samples
forward_filteredQual_plot_2 <- plotQualityProfile(forward_reads[random_samples]) + 
  labs(title = "Forward Read: Raw Quality")

reverse_filteredQual_plot_2 <- plotQualityProfile(reverse_reads[random_samples]) + 
  labs(title = "Reverse Read: Raw Quality")


# Plot them together with patchwork
forward_filteredQual_plot_2 + reverse_filteredQual_plot_2

## error: Error in Ops.data.frame(guide_loc, panel_loc) : 
  # ‘==’ only defined for equally-sized data frames

#the single run of forward and reverse runs the but combined does not want to aggregate idk WHY

Aggregated Raw Quality Plots

# Aggregate all QC plots 
# Forward reads
forward_preQC_plot <- 
  plotQualityProfile(forward_reads, aggregate = TRUE) + 
  labs(title = "Forward Pre-QC")
#show the plot
forward_preQC_plot

# reverse reads
reverse_preQC_plot <- 
  plotQualityProfile(reverse_reads, aggregate = TRUE) + 
  labs(title = "Reverse Pre-QC")
#show the plot
reverse_preQC_plot

## error: Error in Ops.data.frame(guide_loc, panel_loc) : 
  # ‘==’ only defined for equally-sized data frames
#aggregated plot 
preQC_aggregate_plot <- # Plot the forward and reverse together 
forward_preQC_plot + reverse_preQC_plot

#show plot
preQC_aggregate_plot

# Show the plot
preQC_aggregate_plot

[Insert some interpretation regarding the quality of the raw QC plots]

Here, we see that the plots are showing the standard Illumina output: The quality is higher at the beginning of the read and slowly gets worse and worse as the read progresses. This is typical of Illumina sequencing because of phasing. We also see that there’s a slightly lower quality in the reverse reads due to the less efficient chemistry.

Note: The first few bases at the beginning of the forward reads have a VERY LOW quality base. Take a look at the multiQC report to explore the data and confirm what you see here in the dada2 plot.

Prepare a placeholder for filtered reads

# vector of our samples, extract sample name from files 
samples <- sapply(strsplit(basename(forward_reads), "_"), `[`,1) 
# Intuition Check 
head(samples)
## [1] "ERR11588428" "ERR11588429" "ERR11588430" "ERR11588431" "ERR11588432"
## [6] "ERR11588433"
# Place filtered reads into filtered_fastqs_path
filtered_fastqs_path <- "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files"
filtered_fastqs_path
## [1] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files"
# create 2 variables: filtered_F, filtered_R
filtered_forward_reads <- 
  file.path(filtered_fastqs_path, paste0(samples, "_R1_filtered.fastq.gz"))
length(filtered_forward_reads)
## [1] 10
# reverse reads
filtered_reverse_reads <- 
  file.path(filtered_fastqs_path, paste0(samples, "_R2_filtered.fastq.gz"))
head(filtered_forward_reads)
## [1] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588428_R1_filtered.fastq.gz"
## [2] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588429_R1_filtered.fastq.gz"
## [3] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588430_R1_filtered.fastq.gz"
## [4] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588431_R1_filtered.fastq.gz"
## [5] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588432_R1_filtered.fastq.gz"
## [6] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588433_R1_filtered.fastq.gz"
head(filtered_reverse_reads)
## [1] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588428_R2_filtered.fastq.gz"
## [2] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588429_R2_filtered.fastq.gz"
## [3] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588430_R2_filtered.fastq.gz"
## [4] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588431_R2_filtered.fastq.gz"
## [5] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588432_R2_filtered.fastq.gz"
## [6] "/local/workdir/sna49/moon_milk/moonmilk/data/fastq_files/ERR11588433_R2_filtered.fastq.gz"

Filter and Trim Reads

Parameters of filter and trim DEPEND ON THE DATASET. The things to keep in mind are:
- The library preparation: Are the primers included in the sequence? If so, they need to be trimmed out in this step.
- What do the above quality profiles of the reads look like? If they are lower quality, it is highly recommended to use maxEE = c(1,1).
- Do the reads dip suddenly in their quality? If so, explore trimLeft and truncLen

Check out more of the parameters using ?filterAndTrim to bring up the help page and do some googling about it. Some notes on two examples are below, with a description of a few of the parameters:

  1. moon milk: https://link.springer.com/article/10.1007/s00248-023-02286-8#Sec13 this is a paper talking about moonmilk samples This salinity gradient dataset was generated with the library preparation described by Kozich et al., 2013 AEM, the reads maintained high Phred Scores (above 30, even more typically above ~34) all the way through to the end of the sequence. Therefore, we will truncate the data for this dataset and we will use a stringent maxEE = c(1,1). We dont need to trim left. But we will truncate reads after 250bp as per the graph previoulsy produced in the pre-qc step
# Assign a vector to filtered reads 
# trim out poor bases, first 3 bps on F reads
# write out filtered fastq files 
# Here, in this class dataset, the Kozich et al.(2013) AEM
      # Link to paper: https://doi.org/10.1128/AEM.01043-13
# Therefore, we do not need to trim the primers, because they were not sequenced
filtered_reads <- 
  filterAndTrim(fwd = forward_reads, filt = filtered_forward_reads,
              rev = reverse_reads, filt.rev = filtered_reverse_reads,
              maxN = 0, maxEE = c(1,1), 
              orient.fwd = "CCTACGGG",
              truncLen = 250, rm.phix = TRUE, compress = TRUE, multithread = TRUE)

# output
#                        reads.in reads.out
# ERR11588428_1.fastq.gz    96845     58385
# ERR11588429_1.fastq.gz   101519     58720
# ERR11588430_1.fastq.gz    95676     57401
# ERR11588431_1.fastq.gz    90880     55342
# ERR11588432_1.fastq.gz    85608     53582
# ERR11588433_1.fastq.gz    73835     45215
# ERR11588434_1.fastq.gz    67096     36726
# ERR11588435_1.fastq.gz    73421     38971
# ERR11588436_1.fastq.gz    83439     35047
# ERR11588437_1.fastq.gz    62048     33571

Assess Trimmed Read Quality

# Plot the 12 random samples after QC
forward_filteredQual_plot_2 <- 
  plotQualityProfile(filtered_forward_reads[random_samples]) + 
  labs(title = "Trimmed Forward Read Quality")
forward_filteredQual_plot_2

reverse_filteredQual_plot_2 <- 
  plotQualityProfile(filtered_reverse_reads[random_samples]) + 
  labs(title = "Trimmed Reverse Read Quality")
reverse_filteredQual_plot_2

# Put the two plots together 
forward_filteredQual_plot_2 + reverse_filteredQual_plot_2

Aggregated Trimmed Plots

# Aggregate all QC plots 
# Forward reads
forward_postQC_plot <- 
  plotQualityProfile(filtered_forward_reads, aggregate = TRUE) + 
  labs(title = "Forward Post-QC")
forward_postQC_plot

# reverse reads
reverse_postQC_plot <- 
  plotQualityProfile(filtered_reverse_reads, aggregate = TRUE) + 
  labs(title = "Reverse Post-QC")
reverse_postQC_plot

postQC_aggregate_plot <- 
  # Plot the forward and reverse together 
  forward_postQC_plot + reverse_postQC_plot
postQC_aggregate_plot

# Show the plot
postQC_aggregate_plot

Stats on read output from filterAndTrim

# Make output into dataframe 
filtered_df <- as.data.frame(filtered_reads)
head(filtered_df)
##                        reads.in reads.out
## ERR11588428_1.fastq.gz    96845     57420
## ERR11588429_1.fastq.gz   101519     57834
## ERR11588430_1.fastq.gz    95676     56552
## ERR11588431_1.fastq.gz    90880     51923
## ERR11588432_1.fastq.gz    85608     50841
## ERR11588433_1.fastq.gz    73835     42323
# 
#                        reads.in reads.out
# ERR11588428_1.fastq.gz    96845     58385
# ERR11588429_1.fastq.gz   101519     58720
# ERR11588430_1.fastq.gz    95676     57401
# ERR11588431_1.fastq.gz    90880     55342
# ERR11588432_1.fastq.gz    85608     53582
# ERR11588433_1.fastq.gz    73835     45215
# calculate some stats 
filtered_df %>%
  reframe(median_reads_in = median(reads.in),
          median_reads_out = median(reads.out),
          median_percent_retained = (median(reads.out)/median(reads.in)))
##   median_reads_in median_reads_out median_percent_retained
## 1         84523.5            46582                0.551113
#   median_reads_in median_reads_out median_percent_retained
# 1         84523.5          49398.5               0.5844351

We retained about 58.4% of reads which is alright. I think our filter and trim parameters are ok for this situation I feel better about having more stringently processed data and I feel like the quality is good enough

Visualize QC differences in plot

# Plot the pre and post together in one plot
preQC_aggregate_plot / postQC_aggregate_plot

Error Modelling

Note every sequencing run needs to be run separately! The error model MUST be run separately on each Illumina dataset. This is because every Illumina run is different, even if the flow cell and DNA/samples are the same. If you’d like to combine the datasets from multiple Illumina sequencing runs, you’ll need to do the exact same filterAndTrim() step AND, very importantly, you’ll need to have the same primer/ASV length/16S location expected by the output.

But wait: what contributes to sequencing error in different sequencing runs and why do we need to model errors separately per run with learnErrors() in dada2? Remember the core principles of how Illumina seuqencing works! Some things that contribute to this are:

-Different timings for when clusters go out of sync (drop in quality at end of reads that’s typical of Illumina sequencing) - The cluster density is impossible to exactly replicate. Therefore, the cluster density (and therefore sequence quality) will always be different between sequencing runs (even if it’s the same person/samples/sequencing facility!). -PhiX spike-in will also vary between runs, even if we try to make it the same! Therefore, the amount of heterogeneity on the flow cell will also be different, impacting the quality.
-Different locations on the flow cell can be impacted differently between runs. Perhaps an air bubble can get in, the cluster density happened to be higher/lower on a different run/flow cell.

Ok, that said. Let’s now infer error rates for all possible transitions within purines and pyrimidines (A<>G or C<>T) and transversions between all purine and pyrimidine combinations. The error model is learned by alternating estimation of the error rates and inference of sample composition until they converge. This specifically:

  1. Starts with the assumption that the error rates are the maximum (takes the most abundant sequence (“center”) and assumes it’s the only sequence not caused by errors).
  2. Compares the other sequences to the most abundant sequence.
  3. Uses at most 108 nucleotides for the error estimation.
  4. Uses parametric error estimation function of loess fit on the observed error rates.

Learn the errors

# Forward reads 
error_forward_reads <- 
  learnErrors(filtered_forward_reads, multithread = TRUE)
## 106429000 total bases in 425716 reads from 9 samples will be used for learning the error rates.
#101085500 total bases in 404342 reads from 8 samples will be used for learning the error rates

# Plot Forward  
forward_error_plot <- 
  plotErrors(error_forward_reads, nominalQ = TRUE) + 
  labs(title = "Forward Read Error Model")
forward_error_plot
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.

# Reverse reads 
error_reverse_reads <- 
  learnErrors(filtered_reverse_reads, multithread = TRUE)
## 106429000 total bases in 425716 reads from 9 samples will be used for learning the error rates.
# Plot reverse
reverse_error_plot <- 
  plotErrors(error_reverse_reads, nominalQ = TRUE) + 
  labs(title = "Reverse Read Error Model")
# Plot reverse
reverse_error_plot
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

# Put the two plots together
forward_error_plot + reverse_error_plot
## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.
## log-10 transformation introduced infinite values.

our data quality is meh but alright overall it drops in quality

  • The error rates for each possible transition (A→C, A→G, …) are shown in the plot above.

Details of the plot: - Points: The observed error rates for each consensus quality score.
- Black line: Estimated error rates after convergence of the machine-learning algorithm.
- Red line: The error rates expected under the nominal definition of the Q-score.

Similar to what is mentioned in the dada2 tutorial: the estimated error rates (black line) are a “reasonably good” fit to the observed rates (points), and the error rates drop with increased quality as expected. We can now infer ASVs!

Infer ASVs

An important note: This process occurs separately on forward and reverse reads! This is quite a different approach from how OTUs are identified in Mothur and also from UCHIME, oligotyping, and other OTU, MED, and ASV approaches.

# Infer ASVs on the forward sequences
dada_forward <- dada(filtered_forward_reads,
                     err = error_forward_reads, 
                     multithread = TRUE)
## Sample 1 - 57420 reads in 23619 unique sequences.
## Sample 2 - 57834 reads in 25770 unique sequences.
## Sample 3 - 56552 reads in 25006 unique sequences.
## Sample 4 - 51923 reads in 5567 unique sequences.
## Sample 5 - 50841 reads in 6552 unique sequences.
## Sample 6 - 42323 reads in 4624 unique sequences.
## Sample 7 - 35990 reads in 7474 unique sequences.
## Sample 8 - 38075 reads in 7817 unique sequences.
## Sample 9 - 34758 reads in 13517 unique sequences.
## Sample 10 - 33283 reads in 6707 unique sequences.
typeof(dada_forward)
## [1] "list"
# Grab a sample and look at it 
dada_forward$`ERR11588437_R1_filtered.fastq.gz`
## dada-class: object describing DADA2 denoising results
## 472 sequence variants were inferred from 6707 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
# Infer ASVs on the reverse sequences 
dada_reverse <- dada(filtered_reverse_reads,
                     err = error_reverse_reads,
                     multithread = TRUE)
## Sample 1 - 57420 reads in 27514 unique sequences.
## Sample 2 - 57834 reads in 29660 unique sequences.
## Sample 3 - 56552 reads in 28339 unique sequences.
## Sample 4 - 51923 reads in 8981 unique sequences.
## Sample 5 - 50841 reads in 9106 unique sequences.
## Sample 6 - 42323 reads in 7484 unique sequences.
## Sample 7 - 35990 reads in 11539 unique sequences.
## Sample 8 - 38075 reads in 12019 unique sequences.
## Sample 9 - 34758 reads in 15190 unique sequences.
## Sample 10 - 33283 reads in 9152 unique sequences.
dada_reverse$`ERR11588435_R2_filtered.fastq.gz`
## dada-class: object describing DADA2 denoising results
## 518 sequence variants were inferred from 12019 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
# Inspect 
dada_reverse[1]
## $ERR11588428_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 1088 sequence variants were inferred from 27514 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
dada_reverse[10]
## $ERR11588437_R2_filtered.fastq.gz
## dada-class: object describing DADA2 denoising results
## 456 sequence variants were inferred from 9152 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16

Merge Forward & Reverse ASVs

Now, merge the forward and reverse ASVs into contigs.

# merge forward and reverse ASVs
merged_ASVs <- mergePairs(dada_forward, filtered_forward_reads, 
                          dada_reverse, filtered_reverse_reads,
                          verbose = TRUE)
## 39698 paired-reads (in 2956 unique pairings) successfully merged out of 49925 (in 11233 pairings) input.
## 37117 paired-reads (in 2817 unique pairings) successfully merged out of 48774 (in 12296 pairings) input.
## 37096 paired-reads (in 2889 unique pairings) successfully merged out of 48264 (in 11936 pairings) input.
## 50857 paired-reads (in 1239 unique pairings) successfully merged out of 51084 (in 1315 pairings) input.
## 49720 paired-reads (in 1471 unique pairings) successfully merged out of 49991 (in 1566 pairings) input.
## 41173 paired-reads (in 913 unique pairings) successfully merged out of 41620 (in 983 pairings) input.
## 33758 paired-reads (in 1448 unique pairings) successfully merged out of 34410 (in 1701 pairings) input.
## 35944 paired-reads (in 1475 unique pairings) successfully merged out of 36508 (in 1774 pairings) input.
## 26181 paired-reads (in 2511 unique pairings) successfully merged out of 30954 (in 5730 pairings) input.
## 30138 paired-reads (in 1082 unique pairings) successfully merged out of 30985 (in 1344 pairings) input.
# Evaluate the output 
typeof(merged_ASVs)
## [1] "list"
length(merged_ASVs)
## [1] 10
names(merged_ASVs)
##  [1] "ERR11588428_R1_filtered.fastq.gz" "ERR11588429_R1_filtered.fastq.gz"
##  [3] "ERR11588430_R1_filtered.fastq.gz" "ERR11588431_R1_filtered.fastq.gz"
##  [5] "ERR11588432_R1_filtered.fastq.gz" "ERR11588433_R1_filtered.fastq.gz"
##  [7] "ERR11588434_R1_filtered.fastq.gz" "ERR11588435_R1_filtered.fastq.gz"
##  [9] "ERR11588436_R1_filtered.fastq.gz" "ERR11588437_R1_filtered.fastq.gz"
# Inspect the merger data.frame 
head(merged_ASVs[[3]])
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                            sequence
## 1 CCTACGGGGGGCTGCAGTCGAGGATCTTTCGCAATGGGCGCAAGCCTGACGAAGCGACGCCGTGTGAGCGAAGAAGGCCTTCGGGTTGTAAAGCTCTTTCGCTTGGAAACAAGAAAAGCGGGTGAATAATCCGCTAATTTGAGGGTACCAGGTAAAGAAGCACCGGCTAACTCCGTGCCAGCAGCTGCGGTAATACGGAGGGTGCAAGCATTAATCGGATTTATTGGGCGTAAAGGGCGCGTAGGTGGTTTGGAAAGTCAGATGTGAAATTCCGGGGCTCAACTTCGGAGCGGCATCTGAAACTTCTAATCTAGAGGATAGGTGGGGAAAACGGAATTCCACATGTAGCGGTGAAATGCGTAGATATGTGGAAGAACACCGGTGGCGAAGGCGGTTTTCTAACCTACACCTGACACTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCCGGTAGTC
## 2 CCTACGGGGGGCAGCAGTGGGGAATATTGGACAATGGGGGCAACCCTGATCCAGCAATACCGCGTGTGTGAAGAAGGCCTTCGGGTTGTAAAGCACTTTAGGTTGGGGAAAAGATGTATAGTTTAAGAGATGATACATTTGATTGTACCAACAGAATAAGCACCGGCTAACTCTGTGCCAGCAGCCGCGGTAATACAGAGGGTGCAAGCGTTAATCGGAGTTACTGGGCGTAAAGGGCGCGTAGGTGGTGCATTAAGTGAGTTGTGAAAGACCTGGGCTTAACCTGGGAGGTGCAACTTATACTGATGTACTAGAGTACTGTAGAGGGTAGTGGAATTTCCGGTGTAGCGGTGAAATGCGTAGAGATCGGAAGGAACACCAGTGGCGAAGGCGGCTACCTGGACAGATACTGACACTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCCGGTAGTC
## 3                         CCTACGGGGGGCAGCAGTTAGGAATTTTGCGCAATGGGGGAAACCCTGACGCAGCCACGCCGCGTGCGGGATGAAGGCCCTCGGGTCGTAAACCGCTTTTCTCAGGGAAGATCCAAGACGGTACCTGAGGAATAAGCAACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTTGCGAGCGTTGTCCGGATTTACTGGGCGTAAAGCGCACGCAGGCGGTCTCGAAAGTCCTTCGTGAAAGCTCCCGGCTCAACTGGGAGAGGCCGATGGAAACTTCGGGACTTGAAGGCGGGAGAGGCAGATGGAATTCCCGGTGTAGCGGTAAAATGCGTAGATACCGGGAGGAACACCAGTGGCGAAAGCGATCTGCTGGCCCACCCTTGACGCTCATGTGCGAAAGCTAGGGGAGCGACCGGGATTAGATACCCCGGTAGTC
## 4                 CCTACGGGGGGCTGCAGTCGAGGATCTTCGGCAATGAGCGCAAGCTTGACCGAGCGACGCCGCGTGTGGGATGAAGGCCCTCGGGTTGTAAACCACTGTCGAGGGGGAGGAAAGCCGCAAGGTCTGACCTATCCCTGGAGGAAGCACGGGCTAAGTTCGTGCCAGCAGCCGCGGTAAGACGAACCGTGCGAACGTTGTTCGGAATCACTGGGCTTAAAGGGCGCGTAGGCAGTTCGTTAAGTTCAGGGTGAAATCTTTCGGCTCAACCGGAAGACTGCCCTGGATACTGGCGGGCTAGAGGGAGGTAGGGGCGACGGGAACTACCAGTGGAGCGGTGAAATGCGTTGATATTGGTAGGAACGCCGGTGGCGAAAGCGCGTCGCTGGACCTCTCCTGACGCTGAGGCGCGAAAGCTAGGGTAGCAAACGGGATTAGATACCCCGGTAGTC
## 5                          CCTACGGGGGGCAGCAGTCGGGAATTTTGGGCAATGGGGGAAACCCTGACCCAGCAACGCCGCGTGAAGGATGAAGTATTTCGGTATGTAAACTTCGAAAGAATAGGAAGAATAAATGACGGTACTATTTATAAGGTCCGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGGACCAAGCGTTGTTCGGATTTACTGGGCGTAAAGGGCGCGTAGGCGGCATGACAAGTCACTTGTGAAATCTCTGGGCTTAACCCAGAACGGCCAAGTGATACTGTCGCGCTAGAGTGCAGAAGGGGCAATCGGAATTCTTGGTGTAGCGGTGAAATGCGTAGATATCAAGAGGAACACCTGAGGTGAAGACGGGTTGCTGGGCTGACACTGACGCTGAGGCGCGAAAGCCAGGGGAGCAAACGGGATTAGATACCCCGGTAGTC
## 6                          CCTACGGGGGGCAGCAGTGGGGAATATTGGACAATGGGGGAAACCCTGATCCAGCAATGCCGCGTGAATGAAGAAGGCCTTAGGGTTGTAAAGTTCTTTCACCTGTGAAGATAATGACGGTAGCAGGAGAAGAAGCCCCGGCAAACTCCGTGCCAGCAGCCGCGGTAAGACGGAGGGGGCTAGCGTTGTTCGGAATGACTGGGCGTAAAGGGCGCGTAGGCGGTATTTTAAGTGAGGCGTGAAAGCCCCGGGCTTAACCCGGGAGGTGCGTTTCATACTGGAATGCTAGAGTGCGAGAGAGGAAAGTGGAATTCCTAGTGTAGAGGTGAAATTCGTAGATATTAGGAAGAACACCAGAGGCGAAGGCGGCTTTCTGGCTCGCAACTGACGCTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCCGGTAGTC
##   abundance forward reverse nmatch nmismatch nindel prefer accept
## 1       103      53      65     35         0      0      1   TRUE
## 2        98      99      62     35         0      0      1   TRUE
## 3        98      35      58     59         0      0      1   TRUE
## 4        95      69      87     51         0      0      1   TRUE
## 5        94     570     305     60         0      0      2   TRUE
## 6        93      58      59     60         0      0      1   TRUE

Create Raw ASV Count Table

# Create the ASV Count Table 
raw_ASV_table <- makeSequenceTable(merged_ASVs)

# Write out the file to data/01_DADA2


# Check the type and dimensions of the data
dim(raw_ASV_table)
## [1]    10 14324
class(raw_ASV_table)
## [1] "matrix" "array"
typeof(raw_ASV_table)
## [1] "integer"
# Inspect the distribution of sequence lengths of all ASVs in dataset 
table(nchar(getSequences(raw_ASV_table)))
## 
##  271  272  275  294  295  297  298  299  379  405  409  411  412  416  417  418 
##    1    8    1    6   10    6   26    2    1    1    1    2    3    1   10    1 
##  419  422  423  425  426  428  429  432  433  434  435  436  437  438  439  440 
##    3    7    4   40    1    2   71   11   16   41  102    3    1   58   26 1435 
##  441  442  443  444  445  446  447  448  449  450  451  452  453  454  455  456 
##  976  345  178   55 1464  255  214   12  252    7  111   60  137   74  209   25 
##  457  458  459  460  461  462  463  464  465  466  467  468  469  476  477  479 
##  202   49  111  420   43   57   87  324 4673 1968   90    4    4    2    7    3 
##  480  481  486 
##    3    1    1
# Inspect the distribution of sequence lengths of all ASVs in dataset 
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Raw distribution of ASV length")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

###################################################
###################################################
# TRIM THE ASVS
# Let's trim the ASVs to only be the right size, which is 249.
# 249 originates from our expected amplicon of 252 - 3bp in the forward read due to low quality.

# We will allow for a few 
raw_ASV_table_trimmed <- raw_ASV_table[,nchar(colnames(raw_ASV_table)) %in% 440:466]

# Inspect the distribution of sequence lengths of all ASVs in dataset 
table(nchar(getSequences(raw_ASV_table_trimmed)))
## 
##  440  441  442  443  444  445  446  447  448  449  450  451  452  453  454  455 
## 1435  976  345  178   55 1464  255  214   12  252    7  111   60  137   74  209 
##  456  457  458  459  460  461  462  463  464  465  466 
##   25  202   49  111  420   43   57   87  324 4673 1968
# What proportion is left of the sequences? 
sum(raw_ASV_table_trimmed)/sum(raw_ASV_table)
## [1] 0.9682537
# Inspect the distribution of sequence lengths of all ASVs in dataset 
# AFTER TRIM
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Trimmed distribution of ASV length")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Let's zoom in on the plot 
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Trimmed distribution of ASV length") + 
  scale_y_continuous(limits = c(0, 5500))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Taking into account the lower, zoomed-in plot. Do we want to remove those extra ASVs?

We will allow for a few

raw_ASV_table_trimmed <- raw_ASV_table[,nchar(colnames(raw_ASV_table)) %in% 440:466]

Inspect the distribution of sequence lengths of all ASVs in dataset

table(nchar(getSequences(raw_ASV_table_trimmed)))
## 
##  440  441  442  443  444  445  446  447  448  449  450  451  452  453  454  455 
## 1435  976  345  178   55 1464  255  214   12  252    7  111   60  137   74  209 
##  456  457  458  459  460  461  462  463  464  465  466 
##   25  202   49  111  420   43   57   87  324 4673 1968

What proportion is left of the sequences?

sum(raw_ASV_table_trimmed)/sum(raw_ASV_table)
## [1] 0.9682537

Inspect the distribution of sequence lengths of all ASVs in dataset

AFTER TRIM

data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Trimmed distribution of ASV length")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Note the peak at 249 is ABOVE 3000

# Let's zoom in on the plot 
data.frame(Seq_Length = nchar(getSequences(raw_ASV_table_trimmed))) %>%
  ggplot(aes(x = Seq_Length )) + 
  geom_histogram() + 
  labs(title = "Trimmed distribution of ASV length") + 
  scale_y_continuous(limits = c(0, 5500))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Remove Chimeras

Sometimes chimeras arise in our workflow.

Chimeric sequences are artificial sequences formed by the combination of two or more distinct biological sequences. These chimeric sequences can arise during the polymerase chain reaction (PCR) amplification step of the 16S rRNA gene, where fragments from different templates can be erroneously joined together.

Chimera removal is an essential step in the analysis of 16S sequencing data to improve the accuracy of downstream analyses, such as taxonomic assignment and diversity assessment. It helps to avoid the inclusion of misleading or spurious sequences that could lead to incorrect biological interpretations.

# Remove the chimeras in the raw ASV table
noChimeras_ASV_table <- removeBimeraDenovo(raw_ASV_table_trimmed, 
                                           method="consensus", 
                                           multithread=TRUE, verbose=TRUE)
## Identified 9209 bimeras out of 13743 input sequences.
# Check the dimensions
dim(noChimeras_ASV_table)
## [1]   10 4534
# What proportion is left of the sequences? 
sum(noChimeras_ASV_table)/sum(raw_ASV_table_trimmed)
## [1] 0.571369
sum(noChimeras_ASV_table)/sum(raw_ASV_table)
## [1] 0.5532302
# Plot it 
p1 <-
  data.frame(Seq_Length_NoChim = nchar(getSequences(noChimeras_ASV_table))) %>%
  ggplot(aes(x = Seq_Length_NoChim )) + 
  geom_histogram()+ 
  labs(title = "Trimmed + Chimera Removal distribution of ASV length")

p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Track the read counts

Here, we will look at the number of reads that were lost in the filtering, denoising, merging, and chimera removal.

# A little function to identify number seqs 
getN <- function(x) sum(getUniques(x))

# Make the table to track the seqs 
track <- cbind(filtered_reads, 
               sapply(dada_forward, getN),
               sapply(dada_reverse, getN),
               sapply(merged_ASVs, getN),
               rowSums(noChimeras_ASV_table))

head(track)
##                        reads.in reads.out                        
## ERR11588428_1.fastq.gz    96845     57420 53646 52928 39698 21236
## ERR11588429_1.fastq.gz   101519     57834 52927 52528 37117 20768
## ERR11588430_1.fastq.gz    95676     56552 51951 51872 37096 20697
## ERR11588431_1.fastq.gz    90880     51923 51671 51312 50857 29177
## ERR11588432_1.fastq.gz    85608     50841 50594 50229 49720 28614
## ERR11588433_1.fastq.gz    73835     42323 42137 41784 41173 24959
# Update column names to be more informative (most are missing at the moment!)
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")
rownames(track) <- samples

# Generate a dataframe to track the reads through our DADA2 pipeline
track_counts_df <- 
  track %>%
  # make it a dataframe
  as.data.frame() %>%
  rownames_to_column(var = "names") %>%
  mutate(perc_reads_retained = 100 * nochim / input)

# Visualize it in table format 
DT::datatable(track_counts_df)
# Plot it!
track_counts_df %>%
  pivot_longer(input:nochim, names_to = "read_type", values_to = "num_reads") %>%
  mutate(read_type = fct_relevel(read_type, 
                                 "input", "filtered", "denoisedF", "denoisedR", "merged", "nochim")) %>%
  ggplot(aes(x = read_type, y = num_reads, fill = read_type)) + 
  geom_line(aes(group = names), color = "grey") + 
  geom_point(shape = 21, size = 3, alpha = 0.8) + 
  scale_fill_brewer(palette = "Spectral") + 
  labs(x = "Filtering Step", y = "Number of Sequences") + 
  theme_bw()

Assign Taxonomy

Here, we will use the silva database version 138!

It is extremly important to note that our dataset was originally super weird. What fixed our dataset (thanks to Gus!!!!!) was to find reverse complimented ASVs in our dataset and write out the function as “tryRC = TRUE” and it made our dataset so much cleaner and not look insane! We now have acceptable plots

# Classify the ASVs against a reference set using the RDP Naive Bayesian Classifier described by Wang et al., (2007) in AEM
taxa_train <- 
  assignTaxonomy(noChimeras_ASV_table, 
                 "/workdir/in_class_data/taxonomy/silva_nr99_v138.1_train_set.fa.gz", 
                 tryRC = TRUE,
                 multithread=TRUE)

# Add the genus/species information 
taxa_addSpecies <- 
  addSpecies(taxa_train, 
             "/workdir/in_class_data/taxonomy/silva_species_assignment_v138.1.fa.gz")

# Inspect the taxonomy 
taxa_print <- taxa_addSpecies # Removing sequence rownames for display only
rownames(taxa_print) <- NULL
#View(taxa_print)

Prepare the data for export!

1. ASV Table

Below, we will prepare the following:

  1. Two ASV Count tables:
    1. With ASV seqs: ASV headers include the entire ASV sequence. We are using a dataset with V3-V4 region so we expect a biomodal curve for the number of sequences found (we will see this later on and will need to trim our ASVs accordingly)
    2. with ASV names: This includes re-written and shortened headers like ASV_1, ASV_2, etc, which will match the names in our fasta file below.
  2. ASV_fastas: A fasta file that we can use to build a tree for phylogenetic analyses (e.g. phylogenetic alpha diversity metrics or UNIFRAC dissimilarty).

Finalize ASV Count Tables

########### 2. COUNT TABLE ###############
############## Modify the ASV names and then save a fasta file!  ############## 
# Give headers more manageable names
# First pull the ASV sequences
asv_seqs <- colnames(noChimeras_ASV_table)
asv_seqs[1:5]
## [1] "CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCAGTAGTC"
## [2] "CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCGAGTAGTC"
## [3] "CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTC"
## [4] "CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCTAGTAGTC"
## [5] "CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCTGTAGTC"
# make headers for our ASV seq fasta file, which will be our asv names
asv_headers <- vector(dim(noChimeras_ASV_table)[2], mode = "character")
asv_headers[1:5]
## [1] "" "" "" "" ""
# loop through vector and fill it in with ASV names 
for (i in 1:dim(noChimeras_ASV_table)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep = "_")
}

# intitution check
asv_headers[1:5]
## [1] ">ASV_1" ">ASV_2" ">ASV_3" ">ASV_4" ">ASV_5"
##### Rename ASVs in table then write out our ASV fasta file! 
#View(noChimeras_ASV_table)
asv_tab <- t(noChimeras_ASV_table)
#View(asv_tab)

## Rename our asvs! 
row.names(asv_tab) <- sub(">", "", asv_headers)
#View(asv_tab)

2. Taxonomy Table

# Inspect the taxonomy table
#View(taxa_addSpecies)

##### Prepare tax table 
# Add the ASV sequences from the rownames to a column 
new_tax_tab <- 
  taxa_addSpecies%>%
  as.data.frame() %>%
  rownames_to_column(var = "ASVseqs") 
head(new_tax_tab)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                             ASVseqs
## 1 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCAGTAGTC
## 2 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCGAGTAGTC
## 3 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTC
## 4 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCTAGTAGTC
## 5 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCTGTAGTC
## 6 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCGGGTAGTC
##    Kingdom Phylum Class Order Family Genus Species
## 1 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## 2 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## 3 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## 4 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## 5 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## 6 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
# intution check 
stopifnot(new_tax_tab$ASVseqs == colnames(noChimeras_ASV_table))

# Now let's add the ASV names 
rownames(new_tax_tab) <- rownames(asv_tab)
head(new_tax_tab)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                 ASVseqs
## ASV_1 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCAGTAGTC
## ASV_2 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCGAGTAGTC
## ASV_3 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTC
## ASV_4 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCTAGTAGTC
## ASV_5 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCTGTAGTC
## ASV_6 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCGGGTAGTC
##        Kingdom Phylum Class Order Family Genus Species
## ASV_1 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## ASV_2 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## ASV_3 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## ASV_4 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## ASV_5 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
## ASV_6 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA>
### Final prep of tax table. Add new column with ASV names 
asv_tax <- 
  new_tax_tab %>%
  # add rownames from count table for phyloseq handoff
  mutate(ASV = rownames(asv_tab)) %>%
  # Resort the columns with select
  dplyr::select(Kingdom, Phylum, Class, Order, Family, Genus, Species, ASV, ASVseqs)

head(asv_tax)
##        Kingdom Phylum Class Order Family Genus Species   ASV
## ASV_1 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA> ASV_1
## ASV_2 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA> ASV_2
## ASV_3 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA> ASV_3
## ASV_4 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA> ASV_4
## ASV_5 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA> ASV_5
## ASV_6 Bacteria  GAL15  <NA>  <NA>   <NA>  <NA>    <NA> ASV_6
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                 ASVseqs
## ASV_1 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCAGTAGTC
## ASV_2 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCGAGTAGTC
## ASV_3 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCGGTAGTC
## ASV_4 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCTAGTAGTC
## ASV_5 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCCTGTAGTC
## ASV_6 CCTACGGGAGGCAGCAGTTGGGAATTTTGGGCAATGGGCGCAAGCCTGACCCAGCGACGCCGCGTGGGGGATGAAGGCCTTCGGGTCGTAAACCCCTGTCAGGAGGGAAGAAGCCCGCAAGGGTGACGGTACCTCCAGAGGAAGCCCCGGCTAACTACGTGCCAGCAGCCGCGGTAAGACGTAGGGGGCGAGCGTTGTCCGGATTCACTGGGCGTAAAGCGCGAGTAGGTGGCCTGTTAAGTGGCGTGTGAAAGCCTGCGGCTCAACCGTAGGAGGTCGCGCCAGACTGGCGGGCTTGAGGGTGGCAGAGGGTGATGGAATTCCCGGTGTAGCGGTGAAATGCGCAGATATCGGGAGGAACGCCAGTGGCGAAGGCGGTCACCTGGGCCATCCCTAACGCTGAGTCGCGAAAGCTGGGGGAGCGAACGGGATTAGATACCCGGGTAGTC
# Intution check
stopifnot(asv_tax$ASV == rownames(asv_tax), rownames(asv_tax) == rownames(asv_tab))

Write 01_DADA2 files

Now, we will write the files! We will write the following to the data/01_DADA2/ folder. We will save both as files that could be submitted as supplements AND as .RData objects for easy loading into the next steps into R.:

  1. ASV_counts.tsv: ASV count table that has ASV names that are re-written and shortened headers like ASV_1, ASV_2, etc, which will match the names in our fasta file below. This will also be saved as data/01_DADA2/ASV_counts.RData.
  2. ASV_counts_withSeqNames.tsv: This is generated with the data object in this file known as noChimeras_ASV_table. ASV headers include the entire ASV sequence ~450ishbps. In addition, we will save this as a .RData object as data/01_DADA2/noChimeras_ASV_table.RData as we will use this data in analysis/02_Taxonomic_Assignment.Rmd to assign the taxonomy from the sequence headers.
  3. ASVs.fasta: A fasta file output of the ASV names from ASV_counts.tsv and the sequences from the ASVs in ASV_counts_withSeqNames.tsv. A fasta file that we can use to build a tree for phylogenetic analyses (e.g. phylogenetic alpha diversity metrics or UNIFRAC dissimilarty).
  4. We will also make a copy of ASVs.fasta in data/02_TaxAss_FreshTrain/ to be used for the taxonomy classification in the next step in the workflow.
  5. Write out the taxonomy table
  6. track_read_counts.RData: To track how many reads we lost throughout our workflow that could be used and plotted later. We will add this to the metadata in analysis/02_Taxonomic_Assignment.Rmd.
# FIRST, we will save our output as regular files, which will be useful later on. 
# Save to regular .tsv file 
# Write BOTH the modified and unmodified ASV tables to a file!
# Write count table with ASV numbered names (e.g. ASV_1, ASV_2, etc)
write.table(asv_tab, "/local/workdir/sna49/moon_milk/moonmilk/data/01_DADA2/ASV_counts.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write count table with ASV sequence names
write.table(noChimeras_ASV_table, "/local/workdir/sna49/moon_milk/moonmilk/data/01_DADA2/ASV_counts_withSeqNames.tsv", sep = "\t", quote = FALSE, col.names = NA)
# Write out the fasta file for reference later on for what seq matches what ASV
asv_fasta <- c(rbind(asv_headers, asv_seqs))
# Save to a file!
write(asv_fasta, "data/01_DADA2/ASVs.fasta")


# SECOND, let's save the taxonomy tables 
# Write the table 
write.table(asv_tax, "/local/workdir/sna49/moon_milk/moonmilk/data/01_DADA2/ASV_taxonomy.tsv", sep = "\t", quote = FALSE, col.names = NA)


# THIRD, let's save to a RData object 
# Each of these files will be used in the analysis/02_Taxonomic_Assignment
# RData objects are for easy loading :) 
save(noChimeras_ASV_table, file = "/local/workdir/sna49/moon_milk/moonmilk/data/01_DADA2/noChimeras_ASV_table.RData")
save(asv_tab, file = "/local/workdir/sna49/moon_milk/moonmilk/data/01_DADA2/ASV_counts.RData")
# And save the track_counts_df a R object, which we will merge with metadata information in the next step of the analysis in nalysis/02_Taxonomic_Assignment. 
save(track_counts_df, file = "/local/workdir/sna49/moon_milk/moonmilk/data/01_DADA2/track_read_counts.RData")

Session Information